In [60]:
%matplotlib inline
from preamble import *
In [61]:
import pandas as pd
# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
data = pd.read_csv("data/adult.data", header=None, index_col=False,
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                          'marital-status', 'occupation', 'relationship', 'race',
                          'gender', 'capital-gain', 'capital-loss', 'hours-per-week',
                          'native-country', 'income'])
# For illustration purposes, we only select some of the columns:
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week',
             'occupation', 'income']]
# print the first 5 rows
data.head()
Out[61]:
In [62]:
data.gender.value_counts()
Out[62]:
In [63]:
print("Original features:\n", list(data.columns), "\n")
data_dummies = pd.get_dummies(data)
print("Features after get_dummies:\n", list(data_dummies.columns))
In [64]:
data_dummies.head()
Out[64]:
In [65]:
# Get only the columns containing features, that is, all columns from
# 'age' to 'occupation_ Transport-moving'. This range contains all the
# features but not the target.
# .ix was removed from pandas; .loc does the same label-based slicing
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
# extract numpy arrays
X = features.values
y = data_dummies['income_ >50K'].values
print(X.shape, y.shape)
In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(logreg.score(X_test, y_test))
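A sketch of an alternative, assuming scikit-learn >= 0.20: ColumnTransformer lets you keep the data as a DataFrame and apply different preprocessing to continuous and categorical columns, instead of dummy-encoding everything up front:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
ct = ColumnTransformer([
    ('scaled', StandardScaler(), ['age', 'hours-per-week']),
    ('onehot', OneHotEncoder(), ['workclass', 'education', 'gender',
                                 'occupation'])])
X_ct = ct.fit_transform(data.drop('income', axis=1))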
In [67]:
# create a dataframe with an integer feature and a categorical string feature
demo_df = pd.DataFrame({'Integer Feature': [0, 1, 2, 1],
                        'Categorical Feature': ['socks', 'fox', 'socks', 'box']})
demo_df
Out[67]:
In [68]:
pd.get_dummies(demo_df)
Out[68]:
In [69]:
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)
pd.get_dummies(demo_df)
Out[69]:
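Equivalently, get_dummies takes a columns argument naming exactly which columns to encode, regardless of their dtype, so the astype conversion above is not strictly necessary:

pd.get_dummies(demo_df, columns=['Integer Feature', 'Categorical Feature'])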
In [72]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
X, y = mglearn.datasets.make_wave(n_samples=100)
plt.plot(X[:, 0], y, 'o')
# 1,000 evenly spaced points between -3 and 3, with the endpoint 3.0
# dropped so every value later falls strictly inside the binning range
line = np.linspace(-3, 3, 1000)[:-1].reshape(-1, 1)
reg = LinearRegression().fit(X, y)
plt.plot(line, reg.predict(line), label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3).fit(X, y)
plt.plot(line, reg.predict(line), label="decision tree")
plt.ylabel("regression output")
plt.xlabel("input feature")
plt.legend(loc="best")
Out[72]:
In [73]:
np.set_printoptions(precision=2)
bins = np.linspace(-3, 3, 11)
bins
Out[73]:
In [74]:
which_bin = np.digitize(X, bins=bins)
print("\nData points:\n", X[:5])
print("\nBin membership for data points:\n", which_bin[:5])
In [14]:
from sklearn.preprocessing import OneHotEncoder
# transform using the OneHotEncoder.
# sparse=False makes OneHotEncoder return a dense NumPy array
# (the parameter was renamed sparse_output in scikit-learn 1.2)
encoder = OneHotEncoder(sparse=False)
# encoder.fit finds the unique values that appear in which_bin
encoder.fit(which_bin)
# transform creates the one-hot encoding
X_binned = encoder.transform(which_bin)
print(X_binned[:5])
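As a side note (a sketch, assuming scikit-learn >= 0.20): KBinsDiscretizer performs the binning and the one-hot encoding in a single transformer:

from sklearn.preprocessing import KBinsDiscretizer
# ten equal-width bins, one-hot encoded as a dense array
kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense')
X_binned_kb = kb.fit_transform(X)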
In [75]:
X_binned.shape
Out[75]:
In [76]:
line_binned = encoder.transform(np.digitize(line, bins=bins))
plt.plot(X[:, 0], y, 'o')
reg = LinearRegression().fit(X_binned, y)
plt.plot(line, reg.predict(line_binned), label='linear regression binned')
reg = DecisionTreeRegressor(min_samples_split=3).fit(X_binned, y)
plt.plot(line, reg.predict(line_binned), label='decision tree binned')
for bin in bins:
    plt.plot([bin, bin], [-3, 3], ':', c='k')
plt.legend(loc="best")
plt.suptitle("linear_binning")
Out[76]:
In [77]:
X_combined = np.hstack([X, X_binned])
print(X_combined.shape)
In [78]:
plt.plot(X[:, 0], y, 'o')
reg = LinearRegression().fit(X_combined, y)
line_combined = np.hstack([line, line_binned])
plt.plot(line, reg.predict(line_combined), label='linear regression combined')
for bin in bins:
    plt.plot([bin, bin], [-3, 3], ':', c='k')
plt.legend(loc="best")
Out[78]:
In [79]:
X_product = np.hstack([X_binned, X * X_binned])
print(X_product.shape)
In [80]:
plt.plot(X[:, 0], y, 'o')
reg = LinearRegression().fit(X_product, y)
line_product = np.hstack([line_binned, line * line_binned])
plt.plot(line, reg.predict(line_product), label='linear regression combined')
for bin in bins:
    plt.plot([bin, bin], [-3, 3], ':', c='k')
plt.legend(loc="best")
Out[80]:
In [81]:
from sklearn.preprocessing import PolynomialFeatures
# include polynomials up to x ** 10; the default include_bias=True
# also adds a feature that is constantly 1
poly = PolynomialFeatures(degree=10)
poly.fit(X)
X_poly = poly.transform(X)
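Two parameters worth knowing about here: interaction_only=True keeps only products of distinct features (no powers of a single feature), and include_bias=False drops the constant column. A small sketch:

# interactions between distinct features only, no x ** 2 terms,
# and no constant bias column
poly_inter = PolynomialFeatures(degree=2, interaction_only=True,
                                include_bias=False)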
In [82]:
X_poly.shape
Out[82]:
In [84]:
poly.get_feature_names()  # renamed get_feature_names_out in scikit-learn 1.0
Out[84]:
In [85]:
plt.plot(X[:, 0], y, 'o')
reg = LinearRegression().fit(X_poly, y)
line_poly = poly.transform(line)
plt.plot(line, reg.predict(line_poly), label='polynomial linear regression')
plt.legend(loc="best")
Out[85]:
In [86]:
from sklearn.svm import SVR
plt.plot(X[:, 0], y, 'o')
for gamma in [1, 10]:
    svr = SVR(gamma=gamma).fit(X, y)
    plt.plot(line, svr.predict(line), label='SVR gamma=%d' % gamma)
plt.legend(loc="best")
Out[86]:
In [87]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# note: load_boston was removed in scikit-learn 1.2
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=0)
# rescale data:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [88]:
poly = PolynomialFeatures(degree=2).fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)
print(X_train.shape)
print(X_train_poly.shape)
In [89]:
print(poly.get_feature_names())
In [90]:
from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train_scaled, y_train)
print("score without interactions: %f" % ridge.score(X_test_scaled, y_test))
ridge = Ridge().fit(X_train_poly, y_train)
print("score with interactions: %f" % ridge.score(X_test_poly, y_test))
In [91]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100).fit(X_train_scaled, y_train)
print("score without interactions: %f" % rf.score(X_test_scaled, y_test))
rf = RandomForestRegressor(n_estimators=100).fit(X_train_poly, y_train)
print("score with interactions: %f" % rf.score(X_test_poly, y_test))
In [93]:
rf.apply(X_test_poly)
Out[93]:
In [94]:
rf.apply(X_test_poly).shape
Out[94]:
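apply returns, for each sample, the index of the leaf it lands in within each of the 100 trees. One common trick (a sketch, not part of the original analysis) is to one-hot encode these leaf indices and feed them to a linear model:

from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore' covers leaves that never occur in the test data
leaf_encoder = OneHotEncoder(handle_unknown='ignore')
leaves_train = leaf_encoder.fit_transform(rf.apply(X_train_poly))
leaves_test = leaf_encoder.transform(rf.apply(X_test_poly))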
In [95]:
rnd = np.random.RandomState(0)
X_org = rnd.normal(size=(1000, 3))
w = rnd.normal(size=3)
# use the seeded generator so the counts are reproducible
X = rnd.poisson(10 * np.exp(X_org))
y = np.dot(X_org, w)
In [96]:
np.bincount(X[:, 0])
Out[96]:
In [97]:
bins = np.bincount(X[:, 0])
plt.bar(range(len(bins)), bins, color='gray')
plt.ylabel("number of appearances")
plt.xlabel("value")
Out[97]:
In [98]:
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Ridge().fit(X_train, y_train).score(X_test, y_test)
Out[98]:
In [99]:
X_train_log = np.log(X_train + 1)
X_test_log = np.log(X_test + 1)
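The + 1 guards against log(0), since the Poisson counts include zeros. np.log1p computes the same quantity and is the more idiomatic, numerically safer spelling:

# equivalent to np.log(X_train + 1)
X_train_log = np.log1p(X_train)
X_test_log = np.log1p(X_test)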
In [101]:
# X_train_log is already log-transformed, so plot it directly
plt.hist(X_train_log[:, 0], bins=25, color='gray');
In [102]:
Ridge().fit(X_train_log, y_train).score(X_test_log, y_test)
Out[102]:
In [103]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
# add noise features to the data
# the first 30 features are from the dataset, the next 50 are noise
X_w_noise = np.hstack([cancer.data, noise])
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=.5)
# use f_classif (the default) and SelectPercentile to select 50% of the features:
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
# transform training set:
X_train_selected = select.transform(X_train)
print(X_train.shape)
print(X_train_selected.shape)
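SelectPercentile keeps a fraction of the features; its fixed-count counterpart is SelectKBest. A sketch keeping 40 of the 80 features, which here amounts to the same selection:

from sklearn.feature_selection import SelectKBest
select_k = SelectKBest(k=40).fit(X_train, y_train)
print(select_k.transform(X_train).shape)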
In [104]:
from sklearn.feature_selection import f_classif, f_regression, chi2
In [105]:
F, p = f_classif(X_train, y_train)
In [106]:
plt.figure()
plt.plot(p, 'o')
Out[106]:
In [107]:
mask = select.get_support()
print(mask)
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
Out[107]:
In [108]:
from sklearn.linear_model import LogisticRegression
# transform test data:
X_test_selected = select.transform(X_test)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Score with all features: %f" % lr.score(X_test, y_test))
lr.fit(X_train_selected, y_train)
print("Score with only selected features: %f" % lr.score(X_test_selected, y_test))
In [109]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                         threshold="median")
In [110]:
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
print(X_train.shape)
print(X_train_l1.shape)
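With threshold="median", half of the 80 features are kept. The forest fitted inside SelectFromModel is exposed as select.estimator_, so the importances the threshold was applied to can be inspected directly (a sketch):

importances = select.estimator_.feature_importances_
# the median of these importances is the cutoff used above
print("threshold: %f" % np.median(importances))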
In [111]:
mask = select.get_support()
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
Out[111]:
In [112]:
X_test_l1 = select.transform(X_test)
LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
Out[112]:
In [113]:
from sklearn.feature_selection import RFE
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
             n_features_to_select=40)
# alternative: L1-penalized logistic regression as the selector
# (penalty="l1" requires solver="liblinear" in newer scikit-learn)
# select = RFE(LogisticRegression(penalty="l1", solver="liblinear"),
#              n_features_to_select=40)
select.fit(X_train, y_train)
# visualize the selected features:
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
Out[113]:
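Besides the mask, RFE records when each feature was dropped: ranking_ assigns 1 to every selected feature and higher numbers to features eliminated in earlier rounds (a quick sketch):

# 1 = kept; larger values were eliminated earlier
print(select.ranking_)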
In [114]:
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
Out[114]:
In [115]:
select.score(X_test, y_test)
Out[115]:
In [116]:
from mlxtend.feature_selection import SequentialFeatureSelector
sfs = SequentialFeatureSelector(LogisticRegression(), k_features=40,
                                forward=True, scoring='accuracy', cv=5)
sfs = sfs.fit(X_train, y_train)
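Here mlxtend provides the sequential selection. For reference, scikit-learn 0.24 added its own SequentialFeatureSelector with a similar interface; a sketch assuming that version (like the mlxtend variant, it is much slower than univariate selection):

from sklearn.feature_selection import SequentialFeatureSelector as SkSFS
sfs_skl = SkSFS(LogisticRegression(), n_features_to_select=40,
                direction='forward', cv=5)
sfs_skl.fit(X_train, y_train)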
In [117]:
mask = np.zeros(80, dtype='bool')
mask[np.array(sfs.k_feature_idx_)] = True
In [118]:
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
Out[118]:
In [59]:
LogisticRegression().fit(sfs.transform(X_train), y_train).score(sfs.transform(X_test), y_test)
Out[59]:
In [123]:
data = pd.read_csv("data/adult.data", header=None, index_col=False,
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                          'marital-status', 'occupation', 'relationship', 'race',
                          'gender', 'capital-gain', 'capital-loss', 'hours-per-week',
                          'native-country', 'income'])
y = data.income.values
X = pd.get_dummies(data.drop("income", axis=1))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = MinMaxScaler().fit(X_train)
X_train_ = scaler.transform(X_train)
X_test_ = scaler.transform(X_test)
In [124]:
LogisticRegression().fit(X_train_, y_train).score(X_test_, y_test)
Out[124]:
In [125]:
X_train.shape
Out[125]:
In [178]:
select = SelectFromModel(RandomForestClassifier(n_estimators=100),
                         threshold="5 * median")
X_train_selected = select.fit_transform(X_train_, y_train)
X_test_selected = select.transform(X_test_)
In [179]:
LogisticRegression().fit(X_train_selected, y_train).score(X_test_selected, y_test)
Out[179]:
In [180]:
X_train_selected.shape
Out[180]:
In [197]:
poly = PolynomialFeatures(degree=2).fit(X_train_selected)
X_train_selected_poly = poly.transform(X_train_selected)
X_test_selected_poly = poly.transform(X_test_selected)
In [200]:
# penalty="l1" requires solver="liblinear" (or "saga") in newer scikit-learn
lr = LogisticRegression(C=0.01, penalty="l1",
                        solver="liblinear").fit(X_train_selected_poly, y_train)
lr.score(X_test_selected_poly, y_test)
Out[200]:
In [201]:
# names of the selected features whose coefficients survived the l1 penalty
np.array(poly.get_feature_names(X.columns[select.get_support()]))[lr.coef_.ravel() != 0]
Out[201]: